The goal of this analysis is to visualize motif affinity effects on single motifs and highlight the directional impact of pioneering TFs. We will do this by visualizing experimental results, marginalized predictions, and perturbations of in vivo sequences, predicted by our models.
#Standard packages
library(rtracklayer) ; library(GenomicRanges); library(magrittr) ; library(Biostrings)
library(ggplot2) ; library(reshape2); library(plyranges); library(Rsamtools); library(parallel)
library(dplyr); library(data.table); library(patchwork); library(readr); library(testit)
#KNITR Options
setwd("/n/projects/mw2098/publications/2024_weilert_acc/code/2_analysis/")
figure_filepath<-"figures/11_motif_affinity"
options(knitr.figure_dir=figure_filepath, java.parameters = "- Xmx6g")
#Lab sources
source("scripts/r/granges_common.r")
source("scripts/r/metapeak_common.r")
source("scripts/r/knitr_common.r")
source("scripts/r/caching.r")
source("scripts/r/metapeak_functions.r")
#Specific sources
library(BSgenome.Mmusculus.UCSC.mm10)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(ggseqlogo)
source("scripts/r/motif_functions.r")
#Pre-existing variables
threads <- 5
bsgenome<-BSgenome.Mmusculus.UCSC.mm10
txdb<-TxDb.Mmusculus.UCSC.mm10.knownGene
#model information
tasks<-c('oct4', 'sox2', 'nanog', 'klf4', 'zic3')
motif_to_task.list<-list('Oct4-Sox2' = 'oct4', 'Oct4' = 'oct4', 'Sox2' = 'sox2', 'Klf4' = 'klf4', 'Zic3' = 'zic3', 'Nanog' = 'nanog')
motif_to_task.df<-data.frame(task = motif_to_task.list %>% unlist, motif = names(motif_to_task.list))
input_length<-2032
output_length<-1000
flank_length<-(input_length - output_length)/2
#experimental data information
motifs.path<-'tsv/mapped_motifs/all_instances_curated_0based_w_perturb.tsv.gz'
regions.path<-'tsv/mapped_motifs/all_islands_curated_1based_w_experimental_data.tsv.gz'
marg.path<-'tsv/insilico/marginalizations/motifs/all_marginalizations.tsv.gz'
showcase.path<-'tsv/insilico/marginalizations/motifs/all_showcase_profiles.tsv.gz'
cov.bw.list<-list(zic3 = list(pos = 'bw/mesc_zic3_nexus_combined_normalized_positive.bw',
neg = 'bw/mesc_zic3_nexus_combined_normalized_negative.bw'),
oct4 = list(pos = 'bw/mesc_oct4_nexus_combined_normalized_positive.bw',
neg = 'bw/mesc_oct4_nexus_combined_normalized_negative.bw'),
sox2 = list(pos = 'bw/mesc_sox2_nexus_combined_normalized_positive.bw',
neg = 'bw/mesc_sox2_nexus_combined_normalized_negative.bw'),
klf4 = list(pos = 'bw/mesc_klf4_nexus_combined_normalized_positive.bw',
neg = 'bw/mesc_klf4_nexus_combined_normalized_negative.bw'),
nanog = list(pos = 'bw/mesc_nanog_nexus_combined_normalized_positive.bw',
neg = 'bw/mesc_nanog_nexus_combined_normalized_negative.bw'),
atac = list(pos = 'bw/mesc_native_atac_cutsites_combined.bw',
neg = 'bw/mesc_native_atac_cutsites_combined.bw'))
Read in motifs with perturbation scores
motifs.df<-readr::read_tsv(motifs.path)
regions.df<-readr::read_tsv(regions.path)
Import marginalization scores and their associated sequences.
marg.df<-readr::read_tsv(marg.path)
Join marginalization and perturbation scores with motifs.
motifs.df<-motifs.df %>% dplyr::left_join(marg.df) %>% as.data.frame
Compare marginalization scores relative to accessibility and binding models. First, assign task-specific values.
motifs.df<-lapply(names(motif_to_task.list), function(x){
motif.df<-motifs.df %>% dplyr::filter(motif==x)
motif.df$bind_marg<-motif.df[[paste0(motif_to_task.list[[x]], '_marg')]]
motif.df$acc_marg<-motif.df[['atac_wt_marg']]
motif.df$bind_perturb<-log(motif.df[[paste0('bpnet_osknz/', motif_to_task.list[[x]], '/WT/pred_sum')]]) -
log(motif.df[[paste0('bpnet_osknz/', motif_to_task.list[[x]], '/dA/pred_sum')]])
motif.df$acc_perturb<-log(motif.df$`atac_wt/atac/WT/pred_sum`) - log(motif.df$`atac_wt/atac/dA/pred_sum`)
motif.df<-motif.df %>% dplyr::left_join(., regions.df %>%
dplyr::select(region_id, atac_obs, atac_pred, !!sym(motif_to_task.list[[x]]), !!sym(paste0(motif_to_task.list[[x]], '_pred')))) %>%
dplyr::rename(bind_obs = !!sym(motif_to_task.list[[x]]),
bind_pred =!!sym(paste0(motif_to_task.list[[x]], '_pred')))
return(motif.df)
}) %>% rbindlist()
Check the fold-change difference between top5% and bottom5% of each motif
motifs.df %>%
dplyr::group_by(motif) %>%
dplyr::summarize(bind_marg_bottom5 = quantile(bind_marg, .05),
bind_marg_top5 = quantile(bind_marg, .95),
bind_fold_change = bind_marg_top5/bind_marg_bottom5,
acc_marg_bottom5 = quantile(acc_marg, .05),
acc_marg_top5 = quantile(acc_marg, .95),
acc_fold_change = acc_marg_top5/acc_marg_bottom5) %>%
as.data.frame
## motif bind_marg_bottom5 bind_marg_top5 bind_fold_change acc_marg_bottom5
## 1 Klf4 0.04760500 0.4929750 10.355530 0.05587480
## 2 Nanog -0.00583240 0.0799336 -13.705096 -0.02989560
## 3 Oct4 0.03628031 0.2442613 6.732613 0.07417056
## 4 Oct4-Sox2 0.02967745 0.7867802 26.511044 0.02020980
## 5 Sox2 0.01598850 0.1574055 9.844920 0.02254580
## 6 Zic3 0.03376580 0.7665943 22.703277 -0.00849760
## acc_marg_top5 acc_fold_change
## 1 0.3204094 5.734417
## 2 0.0349107 -1.167754
## 3 0.2873913 3.874736
## 4 0.8303845 41.088211
## 5 0.1516709 6.727235
## 6 0.0895820 -10.542035
Plot observed scores relative to accessibility and binding models.
#Plot relationship between binding and accessibility
bind_vs_acc_obs.plot.list<-lapply(names(motif_to_task.list), function(x){
motif.df<-motifs.df %>% dplyr::filter(motif==x)
g<-ggplot(motif.df, aes(x = log(bind_obs), y = log(atac_obs)))+
geom_hline(yintercept = 0, linetype = 'dotted')+
geom_vline(xintercept = 0, linetype = 'dotted')+
ggrastr::geom_point_rast(aes(color = seq_match_quantile), size = .1)+
ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
scale_x_continuous(name = 'Binding obs.', limits = c(-2.5, 6))+
scale_y_continuous(name = 'Acc obs.', limits = c(0, 10))+
ggtitle(x)+
theme_classic() + theme(legend.position = 'bottom')
return(g)
})
bind_vs_acc_obs.plot<-wrap_plots(bind_vs_acc_obs.plot.list, nrow = 1)
ggsave(paste0(figure_filepath, '/bind_obs_vs_acc_obs_vs_ic_scatter.png'), bind_vs_acc_obs.plot, height = 4, width = 20)
ggsave(paste0(figure_filepath, '/bind_obs_vs_acc_obs_vs_ic_scatter.pdf'), bind_vs_acc_obs.plot, height = 4, width = 20)
Plot predicted scores relative to accessibility and binding models.
#Plot relationship between binding and accessibility
bind_vs_acc_pred.plot.list<-lapply(names(motif_to_task.list), function(x){
motif.df<-motifs.df %>% dplyr::filter(motif==x)
g<-ggplot(motif.df, aes(x = log(bind_pred), y = log(atac_pred)))+
geom_hline(yintercept = 0, linetype = 'dotted')+
geom_vline(xintercept = 0, linetype = 'dotted')+
ggrastr::geom_point_rast(aes(color = seq_match_quantile), size = .1)+
ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
scale_x_continuous(name = 'Binding pred.', limits = c(1, 12))+
scale_y_continuous(name = 'Acc pred.', limits = c(2, 10))+
ggtitle(x)+
theme_classic() + theme(legend.position = 'bottom')
return(g)
})
bind_vs_acc_pred.plot<-wrap_plots(bind_vs_acc_pred.plot.list, nrow = 1)
ggsave(paste0(figure_filepath, '/bind_pred_vs_acc_pred_vs_ic_scatter.png'), bind_vs_acc_pred.plot, height = 4, width = 20)
ggsave(paste0(figure_filepath, '/bind_pred_vs_acc_pred_vs_ic_scatter.pdf'), bind_vs_acc_pred.plot, height = 4, width = 20)
Plot marginalization scores relative to accessibility and binding models.
#Plot relationship between binding and accessibility
bind_vs_acc_marg.plot.list<-lapply(names(motif_to_task.list), function(x){
motif.df<-motifs.df %>% dplyr::filter(motif==x)
if (x=='Sox2'){
#Aesthetic zoom for nice figure formation
g_zoom<-ggplot(motif.df, aes(x = bind_marg, y = acc_marg))+
geom_hline(yintercept = 0, linetype = 'dotted')+
geom_vline(xintercept = 0, linetype = 'dotted')+
ggrastr::geom_point_rast(aes(color = seq_match_quantile), size = 1)+
ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
scale_x_continuous(name = 'Binding marg.', limits = c(-.1, .25))+
scale_y_continuous(name = 'Acc marg.', limits = c(-.1, .25))+
ggtitle(x)+
theme_classic() + theme(legend.position = 'bottom')
ggsave(paste0(figure_filepath, '/bind_marg_vs_acc_marg_vs_ic_scatter_sox2_zoom.png'), g_zoom, height = 4, width = 4)
ggsave(paste0(figure_filepath, '/bind_marg_vs_acc_marg_vs_ic_scatter_sox2_zoom.pdf'), g_zoom, height = 4, width = 4)
}
g<-ggplot(motif.df, aes(x = bind_marg, y = acc_marg))+
geom_hline(yintercept = 0, linetype = 'dotted')+
geom_vline(xintercept = 0, linetype = 'dotted')+
ggrastr::geom_point_rast(aes(color = seq_match_quantile), size = .1)+
ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
scale_x_continuous(name = 'Binding marg.', limits = c(-1, 4))+
scale_y_continuous(name = 'Acc marg.', limits = c(-1, 3))+
ggtitle(x)+
theme_classic() + theme(legend.position = 'bottom')
return(g)
})
bind_vs_acc_marg.plot<-wrap_plots(bind_vs_acc_marg.plot.list, nrow = 1)
ggsave(paste0(figure_filepath, '/bind_marg_vs_acc_marg_vs_ic_scatter.png'), bind_vs_acc_marg.plot, height = 4, width = 20)
ggsave(paste0(figure_filepath, '/bind_marg_vs_acc_marg_vs_ic_scatter.pdf'), bind_vs_acc_marg.plot, height = 4, width = 20)
Plot binding contribution scores with the accessibility contribution scores.
#Plot relationship between binding and accessibility
bind_vs_acc_contrib.plot.list<-lapply(names(motif_to_task.list), function(x){
motif.df<-motifs.df %>% dplyr::filter(motif==x)
g<-ggplot(motif.df, aes(x = bind_contrib, y = acc_contrib))+
geom_hline(yintercept = 0, linetype = 'dotted')+
geom_vline(xintercept = 0, linetype = 'dotted')+
ggrastr::geom_point_rast(aes(color = seq_match_quantile), size = .1)+
ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
scale_x_continuous(name = 'Binding contrib.', limits = c(-.5, 2))+
scale_y_continuous(name = 'Acc contrib.', limits = c(-.5, 2))+
ggtitle(x)+
theme_classic() + theme(legend.position = 'bottom')
return(g)
})
bind_vs_acc_contrib.plot<-wrap_plots(bind_vs_acc_contrib.plot.list, nrow = 1)
ggsave(paste0(figure_filepath, '/bind_contrib_vs_acc_contrib_vs_ic_scatter.png'), bind_vs_acc_contrib.plot, height = 4, width = 20)
ggsave(paste0(figure_filepath, '/bind_contrib_vs_acc_contrib_vs_ic_scatter.pdf'), bind_vs_acc_contrib.plot, height = 4, width = 20)
Plot binding marginalization scores relative to PWM scores
#Plot relationship between binding and accessibility
bind_vs_ic_marg.plot.list<-lapply(names(motif_to_task.list), function(x){
motif.df<-motifs.df %>% dplyr::filter(motif==x)
g<-ggplot(motif.df, aes(x = bind_marg, y = seq_match_quantile))+
geom_hline(yintercept = 0, linetype = 'dotted')+
geom_vline(xintercept = 0, linetype = 'dotted')+
ggrastr::geom_point_rast(size = .1)+
ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
# scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
scale_x_continuous(name = 'Binding marg.')+
scale_y_continuous(name = 'PWM score quantile', limits = c(0, 1))+
ggtitle(x)+
theme_classic() + theme(legend.position = 'bottom')
return(g)
})
bind_vs_ic_marg.plot<-wrap_plots(bind_vs_ic_marg.plot.list, nrow = 1)
ggsave(paste0(figure_filepath, '/bind_marg_vs_ic_scatter.png'), bind_vs_ic_marg.plot, height = 4, width = 20)
ggsave(paste0(figure_filepath, '/bind_marg_vs_ic_scatter.pdf'), bind_vs_ic_marg.plot, height = 4, width = 20)
Compare genomic perturbation scores relative to accessibility and binding models.
bind_vs_acc_genomic.plot.list<-lapply(names(motif_to_task.list), function(x){
g<-ggplot(motifs.df %>% dplyr::filter(motif==x), aes(x = bind_perturb, y = acc_perturb))+
geom_hline(yintercept = 0, linetype = 'dotted')+
geom_vline(xintercept = 0, linetype = 'dotted')+
ggrastr::geom_point_rast(aes(color = seq_match_quantile), size = .1)+
ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
scale_x_continuous(name = 'Binding perturb. log(WT/dA)', limits = c(-1, 4))+
scale_y_continuous(name = 'Acc perturb. log(WT/dA)', limits = c(-1, 3))+
ggtitle(x)+
theme_classic() + theme(legend.position = 'bottom')
return(g)
})
bind_vs_acc_genomic.plot<-wrap_plots(bind_vs_acc_genomic.plot.list, nrow = 1)
ggsave(paste0(figure_filepath, '/bind_perturb_vs_acc_perturb_vs_ic_scatter.png'), bind_vs_acc_genomic.plot, height = 4, width = 20)
ggsave(paste0(figure_filepath, '/bind_perturb_vs_acc_perturb_vs_ic_scatter.pdf'), bind_vs_acc_genomic.plot, height = 4, width = 20)
bind_vs_acc.plot<-c(bind_vs_acc_obs.plot.list, bind_vs_acc_pred.plot.list, bind_vs_acc_genomic.plot.list, bind_vs_acc_marg.plot.list) %>% wrap_plots(., nrow = 4, ncol = 6)
ggsave(paste0(figure_filepath, '/bind_vs_acc_vs_ic_scatter.png'), bind_vs_acc.plot, height = 16, width = 20)
ggsave(paste0(figure_filepath, '/bind_vs_acc_vs_ic_scatter.pdf'), bind_vs_acc.plot, height = 16, width = 20)
These plots give us an idea of what the motif relationships look like.
motif_summary.df<-motifs.df %>%
dplyr::group_by(motif, seq) %>%
#First, take median to get standard response across each individual sequence (i.e. ACAA is not that hard to get more than 1x)
dplyr::summarize(bind_marg = median(bind_marg),
bind_perturb = median(bind_perturb),
acc_marg = median(acc_marg),
acc_perturb = median(acc_perturb)) %>%
#Then, take median across each individual sequences, then summarize
dplyr::group_by(motif) %>%
dplyr::summarize(med_bind_marg = median(bind_marg),
med_bind_perturb = median(bind_perturb),
med_acc_marg = median(acc_marg),
med_acc_perturb = median(acc_perturb)) %>%
as.data.table %>%
melt.data.table(id.vars = 'motif') %>%
tidyr::separate(col = variable, into = c(NA, 'model', 'pred_type')) %>%
dplyr::mutate(motif = factor(motif, names(motif_to_task.list)),
pred_type = factor(pred_type, c('perturb', 'marg')))
summary.plot<-ggplot(motif_summary.df, aes(x = motif, y = value, fill = pred_type))+
geom_bar(stat = 'identity', position = 'dodge')+
facet_grid(. ~ model)+
theme_classic() + theme(legend.position = 'bottom')
ggsave(paste0(figure_filepath, '/motif_response_summary.png'), summary.plot, height = 4, width = 4)
ggsave(paste0(figure_filepath, '/motif_response_summary.pdf'), summary.plot, height = 4, width = 4)
How does the contrast between isolation scores and context scores influence the motif that we are interested in performing CRISPR mutations on? Low-affinity Oct4-Sox2 at Btbd11 locus.
btbd11_os.df<-motifs.df %>% dplyr::filter(motif_id==139897) %>%
dplyr::select(bind_marg, bind_perturb, acc_marg, acc_perturb, motif) %>%
melt.data.table(id.vars = 'motif') %>%
tidyr::separate(col = variable, into = c('model', 'pred_type')) %>%
dplyr::mutate(motif = factor(motif, names(motif_to_task.list)),
pred_type = factor(pred_type, c('perturb', 'marg')))
btbd11_os.plot<-ggplot(btbd11_os.df %>% dplyr::filter(), aes(x = motif, y = value, fill = pred_type))+
geom_bar(stat = 'identity', position = 'dodge')+
facet_grid(. ~ model)+
theme_classic() + theme(legend.position = 'bottom')
ggsave(paste0(figure_filepath, '/motif_response_at_btbd11_os.png'), btbd11_os.plot, height = 4, width = 4)
ggsave(paste0(figure_filepath, '/motif_response_at_btbd11_os.pdf'), btbd11_os.plot, height = 4, width = 4)
In this paper they characterize the role of 3 lower-affinity OS motifs at their respective E1-3 enhancers. Do we map them? Yes! How can we validate our models based on these external results?
motif_id_to_klf4_enhancer.df<-data.frame(
motif_id = c(132198, 126751, 126750, 126749),
enhancer = factor(c('nanog_upstream_2', 'klf4_e1','klf4_e2','klf4_e3'),
levels = c('nanog_upstream_2', 'klf4_e1','klf4_e2','klf4_e3'))
)
#Recapitulate F2B-C where they show relative contributions of Oct4-Sox2 motifs
os_motifs_of_interest.df<-motifs.df %>% dplyr::filter(motif_id %in% motif_id_to_klf4_enhancer.df$motif_id) %>%
dplyr::left_join(motif_id_to_klf4_enhancer.df)
os_motifs_of_interest.plot<-os_motifs_of_interest.df %>%
dplyr::select(enhancer, bind_perturb, acc_perturb) %>%
tidyr::pivot_longer(cols = c(bind_perturb, acc_perturb), names_to = 'model', values_to = 'perturb') %>%
ggplot(., aes(x = enhancer, y = perturb, fill = model))+
geom_bar(stat = 'identity')+facet_grid(. ~ model)
ggsave(paste0(figure_filepath, '/motif_response_at_klf4_enhancers.png'), os_motifs_of_interest.plot, height = 4, width = 8)
ggsave(paste0(figure_filepath, '/motif_response_at_klf4_enhancers.pdf'), os_motifs_of_interest.plot, height = 4, width = 8)
affinity_corr.df<-motifs.df %>%
dplyr::group_by(motif) %>%
dplyr::mutate(motif = factor(motif, levels = names(motif_to_task.list))) %>%
dplyr::summarize(
obs_atac_vs_seq_q = cor(atac_obs, seq_match_quantile, method = 'spearman'),
obs_bind_vs_obs_atac = cor(bind_obs, atac_obs, method = 'spearman'),
obs_bind_vs_seq_q = cor(bind_obs, seq_match_quantile, method = 'spearman'),
pred_atac_vs_seq_q = cor(atac_pred, seq_match_quantile, method = 'spearman'),
pred_bind_vs_pred_atac = cor(bind_pred, atac_pred, method = 'spearman'),
pred_bind_vs_seq_q = cor(bind_pred, seq_match_quantile, method = 'spearman'),
perturb_acc_vs_seq_q = cor(acc_perturb, seq_match_quantile, method = 'spearman'),
perturb_bind_vs_perturb_acc = cor(bind_perturb, acc_perturb, method = 'spearman'),
perturb_bind_vs_seq_q = cor(bind_perturb, seq_match_quantile, method = 'spearman'),
marg_acc_vs_seq_q = cor(acc_marg, seq_match_quantile, method = 'spearman'),
marg_bind_vs_marg_acc = cor(bind_marg, acc_marg, method = 'spearman'),
marg_bind_vs_seq_q = cor(bind_marg, seq_match_quantile, method = 'spearman'),
acc_contrib_vs_seq_q = cor(acc_contrib, seq_match_quantile, method = 'spearman'),
bind_contrib_vs_acc_contrib = cor(bind_contrib, acc_contrib, method = 'spearman'),
bind_contrib_vs_seq_q = cor(bind_contrib, seq_match_quantile, method = 'spearman')
)
affinity_corr.df
## # A tibble: 6 × 16
## motif obs_atac_vs_seq_q obs_bind_vs_obs_atac obs_bind_vs_seq_q
## <fct> <dbl> <dbl> <dbl>
## 1 Oct4-Sox2 -0.0198 0.794 0.0694
## 2 Oct4 0.187 0.806 0.105
## 3 Sox2 -0.0758 0.737 -0.0655
## 4 Klf4 -0.123 0.741 0.00463
## 5 Zic3 -0.138 0.500 0.102
## 6 Nanog -0.0953 0.536 -0.124
## # ℹ 12 more variables: pred_atac_vs_seq_q <dbl>, pred_bind_vs_pred_atac <dbl>,
## # pred_bind_vs_seq_q <dbl>, perturb_acc_vs_seq_q <dbl>,
## # perturb_bind_vs_perturb_acc <dbl>, perturb_bind_vs_seq_q <dbl>,
## # marg_acc_vs_seq_q <dbl>, marg_bind_vs_marg_acc <dbl>,
## # marg_bind_vs_seq_q <dbl>, acc_contrib_vs_seq_q <dbl>,
## # bind_contrib_vs_acc_contrib <dbl>, bind_contrib_vs_seq_q <dbl>
readr::write_tsv(affinity_corr.df, 'tsv/affinity_vs_bind_vs_acc_cor.tsv')
In order to assess whether our marginalization scores are an accurate reflection of in vitro TF binding affinity, we will compare them to the closest published PBMs in the UniProbe database.
pbm_data.list<-list(
'sox2_human_rep1' = readr::read_tsv('../../public/databases/UniProbe/SOX2_anti-GST_rep1/SOX2_anti-GST_rep1_8mers_11111111.txt'),
'sox2_human_rep2' = readr::read_tsv('../../public/databases/UniProbe/SOX2_anti-GST_rep2/SOX2_anti-GST_rep2_8mers_11111111.txt'),
'pou5f1_human_rep1' = readr::read_tsv('../../public/databases/UniProbe/POU5F1_anti-GST_rep1/POU5F1_anti-GST_rep1_8mers_11111111.txt'),
'pou5f1_human_rep2' = readr::read_tsv('../../public/databases/UniProbe/POU5F1_anti-GST_rep2/POU5F1_anti-GST_rep2_8mers_11111111.txt'),
'klf1_mouse_rep1' = readr::read_tsv('../../public/databases/UniProbe/Klf1/Klf1_8mers_11111111.txt'),
'zic3_mouse_rep1' = readr::read_tsv('../../public/databases/UniProbe/Zic3/3119.2_v1/Zic3_3119.2_v1_contig8mers.txt'),
'zic3_mouse_rep2' = readr::read_tsv('../../public/databases/UniProbe/Zic3/3119.2_v2/Zic3_3119.2_v2_contig8mers.txt')
)
pbm_data.list<-lapply(pbm_data.list, function(x) {colnames(x)<- c('seq_8mer', 'seq_rev_8mer', 'escore', 'med_intensity', 'zscore'); return(x)})
colnames(pbm_data.list$zic3_mouse_rep1)<-c('seq_8mer', 'seq_rev_8mer', 'med_intensity', 'escore', 'zscore', 'filler1', 'filler2','filler3','filler4')
colnames(pbm_data.list$zic3_mouse_rep2)<-c('seq_8mer', 'seq_rev_8mer', 'med_intensity', 'escore', 'zscore', 'filler1', 'filler2','filler3','filler4')
pbm_to_motif.vec<-c('Sox2','Sox2','Oct4-Sox2','Oct4-Sox2','Klf4','Zic3','Zic3')
names(pbm_to_motif.vec)<-names(pbm_data.list)
alignment.list<-list(
'sox2_human_rep1' = motifs.df %>% dplyr::filter(motif=='Sox2') %>%
makeGRangesFromDataFrame(keep.extra.columns = T, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end') %>%
GenomicRanges::resize(., width = 8, fix = 'start') %>%
plyranges::mutate(seq = getSeq(bsgenome, ., as.character = T)) %>% as.data.frame %>%
dplyr::select(seq, motif, bind_marg, acc_marg, bind_perturb, acc_perturb, seq_match) %>% dplyr::mutate(tf = 'sox2', rep = 1),
'sox2_human_rep2' = motifs.df %>% dplyr::filter(motif=='Sox2') %>%
makeGRangesFromDataFrame(keep.extra.columns = T, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end') %>%
GenomicRanges::resize(., width = 8, fix = 'start') %>%
plyranges::mutate(seq = getSeq(bsgenome, ., as.character = T)) %>% as.data.frame %>%
dplyr::select(seq, motif, bind_marg, acc_marg, bind_perturb, acc_perturb, seq_match) %>% dplyr::mutate(tf = 'sox2', rep = 2),
'pou5f1_human_rep1' = motifs.df %>% dplyr::filter(motif=='Oct4-Sox2') %>%
makeGRangesFromDataFrame(keep.extra.columns = T, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end') %>%
GenomicRanges::resize(., width = 8, fix = 'start') %>%
plyranges::mutate(seq = getSeq(bsgenome, ., as.character = T)) %>% as.data.frame %>%
dplyr::select(seq, motif, bind_marg, acc_marg, bind_perturb, acc_perturb, seq_match) %>% dplyr::mutate(tf = 'pou5f1', rep = 1),
'pou5f1_human_rep2' = motifs.df %>% dplyr::filter(motif=='Oct4-Sox2') %>%
makeGRangesFromDataFrame(keep.extra.columns = T, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end') %>%
GenomicRanges::resize(., width = 8, fix = 'start') %>%
plyranges::mutate(seq = getSeq(bsgenome, ., as.character = T)) %>% as.data.frame %>%
dplyr::select(seq, motif, bind_marg, acc_marg, bind_perturb, acc_perturb, seq_match) %>% dplyr::mutate(tf = 'pou5f1', rep = 2),
'klf1_mouse_rep1' = motifs.df %>% dplyr::filter(motif=='Klf4') %>%
makeGRangesFromDataFrame(keep.extra.columns = T, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end') %>%
GenomicRanges::resize(., width = 8, fix = 'center') %>%
plyranges::mutate(seq = getSeq(bsgenome, ., as.character = T)) %>% as.data.frame %>%
dplyr::select(seq, motif, bind_marg, acc_marg, bind_perturb, acc_perturb, seq_match) %>% dplyr::mutate(tf = 'klf1', rep = 1),
'zic3_mouse_rep1' = motifs.df %>% dplyr::filter(motif=='Zic3') %>%
makeGRangesFromDataFrame(keep.extra.columns = T, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end') %>%
GenomicRanges::resize(., width = 8, fix = 'center') %>%
plyranges::mutate(seq = getSeq(bsgenome, ., as.character = T)) %>% as.data.frame %>%
dplyr::select(seq, motif, bind_marg, acc_marg, bind_perturb, acc_perturb, seq_match) %>% dplyr::mutate(tf = 'zic3', rep = 1),
'zic3_mouse_rep2' = motifs.df %>% dplyr::filter(motif=='Zic3') %>%
makeGRangesFromDataFrame(keep.extra.columns = T, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end') %>%
GenomicRanges::resize(., width = 8, fix = 'center') %>%
plyranges::mutate(seq = getSeq(bsgenome, ., as.character = T)) %>% as.data.frame %>%
dplyr::select(seq, motif, bind_marg, acc_marg, bind_perturb, acc_perturb, seq_match) %>% dplyr::mutate(tf = 'zic3', rep = 2)
)
pbm_vs_bpnet.df<-mclapply(names(pbm_data.list), function(x){
pbm.df<-pbm_data.list[[x]] %>% arrange(desc(zscore))
cwm.df<-alignment.list[[x]]
#Calculate new IC
pwm<-Biostrings::PWM(cwm.df$seq)
#Double check my math
# cwm.df$pwm_score<-mclapply(cwm.df$seq, function(y) PWMscoreStartingAt(pwm, y), mc.cores = 8) %>% unlist
cwm.df$ic_score<-mclapply(cwm.df$seq, function(y) get_sequence_ic(y, pwm), mc.cores = 8) %>% unlist
cwm_w_zscore.df <- cwm.df %>%
dplyr::left_join(pbm.df %>% dplyr::select(-seq_rev_8mer, seq_8mer, zscore) %>% dplyr::rename(seq = seq_8mer)) %>%
dplyr::left_join(pbm.df %>% dplyr::select(-seq_8mer, seq_rev_8mer, zscore) %>% dplyr::rename(seq = seq_rev_8mer))
}, mc.cores = 7) %>% rbindlist(fill = T)
pbm_vs_bpnet.df<-pbm_vs_bpnet.df %>% dplyr::mutate(tf = factor(tf, levels = c('sox2','klf1','zic3','pou5f1')))
pwm.plot<-ggplot(pbm_vs_bpnet.df %>% dplyr::select(ic_score, zscore, seq, rep, tf) %>% unique(), aes(x = ic_score, y = zscore, color = factor(rep)))+
ggrastr::geom_point_rast(size = .1)+
ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top', hjust = 0)+
geom_smooth(method = lm, se = FALSE)+
# scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
scale_x_continuous(name = 'PWM score')+
scale_y_continuous(name = 'PBM max Z-score')+
scale_color_manual(values = c('#6a3d9a', '#33a02c'))+
facet_grid(. ~ tf, scales = 'free')+
theme_classic() + theme(legend.position = 'bottom')
isolation.plot<-ggplot(pbm_vs_bpnet.df %>% dplyr::group_by(seq, tf, rep) %>% dplyr::summarize(bind_marg = max(bind_marg), zscore = max(zscore)) %>% dplyr::ungroup() %>%dplyr::select(bind_marg, zscore, seq, rep, tf) %>% unique(),
aes(x = bind_marg, y = zscore, color = factor(rep)))+
ggrastr::geom_point_rast(size = .1)+
geom_smooth(method = lm, se = FALSE)+
ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top', hjust = 0)+
# scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
scale_x_continuous(name = 'Isolation score')+
scale_y_continuous(name = 'PBM max Z-score')+
scale_color_manual(values = c('#6a3d9a', '#33a02c'))+
facet_grid(. ~ tf, scales = 'free')+
theme_classic() + theme(legend.position = 'bottom')
plot <- pwm.plot + isolation.plot + plot_layout(ncol = 1)
ggsave(paste0(figure_filepath, '/pbm_vs_preds.png'), plot, height = 6, width = 8)
ggsave(paste0(figure_filepath, '/pbm_vs_preds.pdf'), plot, height = 6, width = 8)
In order to validate that the variability of perturbation scores (as compared to marginalization scores) is an actual reflection of overall binding levels, we need to visualize this comparison.
cor.plot<-motifs.df %>%
dplyr::group_by(motif) %>%
# dplyr::slice_max(order_by = seq_match_quantile, n = 10000) %>%
dplyr::summarize(bind_marg_vs_bind_obs = cor(bind_marg, bind_obs, method = 'spearman'),
bind_perturb_vs_bind_obs = cor(bind_perturb, bind_obs, method = 'spearman'),
acc_marg_vs_acc_obs = cor(acc_marg, atac_obs, method = 'spearman'),
acc_perturb_vs_acc_obs = cor(acc_perturb, atac_obs, method = 'spearman')) %>%
dplyr::mutate(motif = factor(motif, levels = motif_to_task.list %>% names() %>% rev)) %>%
tidyr::pivot_longer(cols = c(bind_marg_vs_bind_obs, bind_perturb_vs_bind_obs, acc_marg_vs_acc_obs, acc_perturb_vs_acc_obs), names_to = 'cor_comparison', values_to = 's_cor') %>%
ggplot(., aes(x = motif, y = s_cor, fill = cor_comparison))+
geom_bar(color = 'black', stat = 'identity', position = 'dodge')+
# geom_text(aes(label = round(s_cor, 2)), color = 'black')+
scale_x_discrete(name = 'comparison', expand = c(0,0))+
scale_y_continuous(name = 'S. corr', expand = c(0,0))+
# scale_fill_gradient2(high = '#b2182b', mid = 'white', low = '#2166ac', midpoint = 0, name = 'Spearman/corr')+
theme_classic()
cor.plot
ggsave(paste0(figure_filepath, '/model_pred_vs_obs_corr.png'), cor.plot, height = 4, width = 8)
ggsave(paste0(figure_filepath, '/model_pred_vs_obs_corr.pdf'), cor.plot, height = 4, width = 8)
Compare why perturbation scores are so variable.
count_threshold<-20
motifs_w_peak_dist.df<-motifs.df %>%
dplyr::mutate(peak_dist = abs(pattern_center - 500),
motif = factor(motif, levels = names(motif_to_task.list))) %>%
dplyr::group_by(motif, peak_dist) %>%
dplyr::summarize(med_bind_perturb = median(bind_perturb),
med_acc_perturb = median(acc_perturb),
counts = dplyr::n()) %>%
dplyr::filter(counts>=count_threshold) %>%
tidyr::pivot_longer(cols = c(med_bind_perturb, med_acc_perturb), names_to = 'model', values_to = 'med_perturb')
ggplot(motifs_w_peak_dist.df, aes(x = peak_dist, y = med_perturb, color = model))+
geom_hline(yintercept = 0, linetype = 'dashed')+
geom_line()+
facet_grid(motif ~ .)+
scale_x_continuous(limits = c(1, 300))+
scale_y_continuous(limits = c(-.1, .6))+
theme_classic()
showcase.df<-readr::read_tsv(showcase.path) %>%
as.data.frame() %>%
dplyr::filter(model_name %in% c('atac_wt', 'bpnet_osknz')) %>%
dplyr::mutate(motif = factor(motif, levels = paste0(names(motif_to_task.list), ' motif')),
motif_vs_affinity = paste0(motif, ' ', affinity),
task = factor(task, levels = c('Atac pred.', 'Oct4 pred.', 'Sox2 pred.', 'Klf4 pred.', 'Zic3 pred.', 'Nanog pred.')))
showcase.plot<-ggplot()+
geom_line(data = showcase.df,
mapping = aes(x = position, y = stranded_null_pred, group = channel), color = 'gray', alpha = .5)+
geom_line(data = showcase.df,
mapping = aes(x = position, y = stranded_pred, color = affinity, group = label))+
scale_x_continuous(name = 'Predicted position')+
scale_y_continuous(name = 'Predicted reads')+
scale_color_manual(name = 'Injection type', values = c('#6a3d9a', '#33a02c', '#1f78b4'))+
facet_grid(task ~ motif, scales = 'free_y')+
theme_classic()
showcase.plot
ggsave(paste0(figure_filepath, '/inj_showcase.plot.png'), showcase.plot, height = 10, width = 18)
ggsave(paste0(figure_filepath, '/inj_showcase.plot.pdf'), showcase.plot, height = 10, width = 18)
Showcase marginalization scores without Oct4-Sox2.
showcase.plot<-ggplot()+
geom_line(data = showcase.df %>% dplyr::filter(motif != 'Oct4-Sox2 motif'),
mapping = aes(x = position, y = stranded_null_pred, group = channel), color = 'gray', alpha = .5)+
geom_line(data = showcase.df %>% dplyr::filter(motif != 'Oct4-Sox2 motif'),
mapping = aes(x = position, y = stranded_pred, color = affinity, group = label))+
scale_x_continuous(name = 'Predicted position')+
scale_y_continuous(name = 'Predicted reads')+
scale_color_manual(name = 'Injection type', values = c('#6a3d9a', '#33a02c', '#1f78b4'))+
facet_grid(task ~ motif, scales = 'free_y')+
theme_classic()
showcase.plot
ggsave(paste0(figure_filepath, '/inj_showcase_no_os.plot.png'), showcase.plot, height = 10, width = 15)
ggsave(paste0(figure_filepath, '/inj_showcase_no_os.plot.pdf'), showcase.plot, height = 10, width = 15)
Taking the median isolation scores and all corresponding motifs that match those sequences, plot a metapeak so people can see footprinting profiles.
First, import marginalization scores.
marg_med.df<-readr::read_tsv('tsv/insilico/marginalizations/motifs/all_median_profiles.tsv.gz') %>%
dplyr::filter(model_name %in% c('bpnet_osknz', 'atac_wt')) %>%
dplyr::mutate(null_pred = ifelse(channel == 0, null_pred, null_pred * (-1)),
marg_pred = ifelse(channel == 0, pred, pred * (-1))) %>%
dplyr::group_by(position, channel, task, motif, model_name) %>%
dplyr::summarize(med_pred = median(marg_pred), med_ref = median(null_pred)) %>%
dplyr::mutate(position_wrt_motif = position - output_length/2)
Next, import binding perturbation scores and correct coordinates.
context.df<-c('tsv/genomic/motifs/genomic_pertubs_median_motifs_formatted_bpnet_osknz.tsv.gz',
'tsv/genomic/motifs/genomic_pertubs_median_motifs_formatted_atac_wt.tsv.gz') %>%
lapply(., function(x){
message(x)
#Import perturbations
perturb_all.df<-readr::read_tsv(x)
#Import motifs and find relative coordinates and names
perturb_motifs.df<-readr::read_tsv('tsv/genomic/motifs/medians_formatted_for_genomic_perturbations_0based.tsv.gz') %>%
dplyr::group_by(region_id, motif) %>%
dplyr::mutate(motif_num = dplyr::row_number(),
pattern_center = floor((motif_window_end - motif_window_start)/2) + motif_window_start - flank_length) %>%
dplyr::ungroup() %>%
dplyr::mutate(pattern_name_unique = paste0(motif, "-", motif_num)) %>% as.data.frame()
#Align motif coordinates to neutral output prediction indexes
perturb_ref.df<-perturb_all.df %>% dplyr::filter(mut=='Reference')
perturb_mut.df<-perturb_all.df %>% dplyr::filter(mut!='Reference')
rm(perturb_all.df)
#Grab perturbations that are collected around the motifs, attach WT profile
perturb_summary.df<-perturb_mut.df %>%
dplyr::left_join(perturb_motifs.df %>%
dplyr::select(region_id, pattern_center, pattern_name_unique) %>%
dplyr::rename(mut = pattern_name_unique)) %>%
dplyr::mutate(position_wrt_motif = window_position - pattern_center) %>%
dplyr::group_by(region_id, mut) %>%
dplyr::mutate(downstream_max = max(position_wrt_motif),
upstream_min = min(position_wrt_motif)) %>%
dplyr::ungroup() %>%
dplyr::filter(downstream_max>200, upstream_min < (-200), abs(position_wrt_motif)<200) %>% #plot a 400bp window, filter any motifs that don't make the cut
dplyr::left_join(perturb_ref.df %>% dplyr::select(region_id, pred, window_position, model_name, task, channel) %>% dplyr::rename(ref_pred = pred)) %>%
dplyr::mutate(mut = gsub('Oct4-Sox2', 'Oct4Sox2', mut)) %>%
tidyr::separate(mut, into = c('motif', NA), sep = '-') %>%
#Summarize results and plot profiles.
dplyr::group_by(motif, position_wrt_motif, task, channel) %>%
dplyr::summarize(med_pred = median(pred), med_ref = median(ref_pred)) %>%
dplyr::mutate(med_pred = ifelse(channel %in% c('pos', 'atac'), med_pred, med_pred * (-1)),
med_ref = ifelse(channel %in% c('pos', 'atac'), med_ref, med_ref * (-1)))
return(perturb_summary.df)
}) %>% rbindlist()
Plot the motifs effects with marginalization side by side
interp_all.df<-rbind(
marg_med.df %>% dplyr::mutate(task = gsub(' pred.', '', task) %>% tolower(.),
motif = gsub(' motif', '', motif),
channel = as.character(channel)) %>%
dplyr::mutate(interp_type = 'marg', interp_score = med_pred - med_ref),
context.df %>% dplyr::mutate(interp_type = 'context', interp_score = med_ref - med_pred)
) %>% dplyr::left_join(motif_to_task.df %>% dplyr::add_row(task = 'oct4', motif = 'Oct4Sox2') %>% dplyr::rename(intended_task = task)) %>%
dplyr::filter((task==intended_task) | (task=='atac'))
overlaid_context_vs_marg.plot<-ggplot(interp_all.df, aes(x = position_wrt_motif, y = interp_score, group = channel, color = interp_type))+
geom_line()+
scale_x_continuous(limits = c(-200, 200))+
facet_grid(task ~ motif, scales = 'free')+
theme_classic()
ggsave(paste0(figure_filepath, '/marg_vs_perturb_profiles.png'), overlaid_context_vs_marg.plot, height = 14, width = 10)
ggsave(paste0(figure_filepath, '/marg_vs_perturb_profiles.pdf'), overlaid_context_vs_marg.plot, height = 14, width = 10)
Compare marginalization scores relative to accessibility and binding models.
tf_at_os_motifs.plot.list<-lapply(names(motif_to_task.list), function(x){
tf_marg_variable<-paste0(motif_to_task.list[[x]], '_marg')
tf_at_os_motifs.plot<-motifs.df %>% dplyr::filter(motif=='Oct4-Sox2') %>%
ggplot(., aes(x = !!sym(tf_marg_variable), y = acc_marg))+
geom_hline(yintercept = 0, linetype = 'dotted')+
geom_vline(xintercept = 0, linetype = 'dotted')+
ggrastr::geom_point_rast(aes(color = seq_match_quantile), size = .1)+
ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
scale_x_continuous(name = tf_marg_variable, limits = c(-.25, max(motifs.df$bind_marg)))+
scale_y_continuous(name = 'Acc marg.', limits = c(-.25, max(motifs.df$acc_marg)))+
ggtitle(paste0(tf_marg_variable, ' vs acc marg at OS motifs'))+
theme_classic() + theme(legend.position = 'bottom')
return(tf_at_os_motifs.plot)
})
tf_at_os_motifs.plot<-wrap_plots(tf_at_os_motifs.plot.list, nrow = 1)
ggsave(paste0(figure_filepath, '/marg_scores_over_os_motifs_across_other_tfs_vs_ic_scatter.png'), tf_at_os_motifs.plot, height = 4, width = 20)
ggsave(paste0(figure_filepath, '/marg_scores_over_os_motifs_across_other_tfs_vs_ic_scatter.pdf'), tf_at_os_motifs.plot, height = 4, width = 20)
isolated_os_motifs.gr<-motifs.df %>%
dplyr::left_join(., regions.df %>% dplyr::select(region_id, island_content)) %>%
dplyr::filter(motif=='Oct4-Sox2', island_content=='Oct4-Sox2:1') %>%
makeGRangesFromDataFrame(keep.extra.columns = T, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end',
strand.field = 'strand', starts.in.df.are.0based = T) %>%
plyranges::filter(!overlapsAny(., rtracklayer::import('narrowpeak/mesc_nanog_nexus_peaks.bed'))) %>%
GenomicRanges::resize(., 1, 'center')
nonpeak_of_nanog.gr<-rtracklayer::import('narrowpeak/mesc_atac_non_peaks.narrowPeak') %>%
sample(size = isolated_os_motifs.gr %>%
dplyr::filter(motif=='Oct4-Sox2') %>%
dplyr::filter(island_content=='Oct4-Sox2:1') %>% length(), replace = F) %>%
GenomicRanges::resize(., 1, 'center')
cov_across_os.df<-lapply(c('oct4','sox2','klf4','nanog','zic3'), function(x){
message(x)
#Filter to not contain any peak of TF that you're plotting across, avoiding direct profiles
filtered.gr<-isolated_os_motifs.gr %>%
plyranges::filter(!overlapsAny(., rtracklayer::import(paste0('narrowpeak/mesc_', x, '_nexus_peaks.bed'))))
cov_across_os_high.df<-filtered.gr %>% plyranges::filter(seq_match_quantile>.5) %>%
exo_metapeak(., sample = cov.bw.list[[x]], upstream = 150, downstream = 151) %>%
dplyr::mutate(type = 'cov across OS:1 high',
tf = x)
cov_across_os_low.df<-filtered.gr %>% plyranges::filter(seq_match_quantile<=.5) %>%
exo_metapeak(., sample = cov.bw.list[[x]], upstream = 150, downstream = 151) %>%
dplyr::mutate(type = 'cov across OS:1 low',
tf = x)
cov_across_nonpeak.df<- nonpeak_of_nanog.gr%>%
exo_metapeak(., sample = cov.bw.list[[x]], upstream = 150, downstream = 151) %>%
dplyr::mutate(type = 'cov across nonpeak',
tf = x)
df<-rbind(cov_across_os_high.df, cov_across_os_low.df, cov_across_nonpeak.df)
return(df)
}) %>% rbindlist() %>%
dplyr::mutate(tf = factor(tf, levels = c('oct4','sox2','nanog','klf4','zic3')))
cov_across_os.mp<-ggplot(cov_across_os.df, aes(x = tss_distance, y = reads, group = interaction(strand, type), color = type))+
geom_line()+
scale_x_continuous(name = 'Distance from motif center(bp)')+
scale_y_continuous(name = 'Experimental data')+
ggtitle('Coverage across OS motifs')+
facet_grid(tf ~ ., scales = 'free')+
theme_classic()
cov_across_os.mp
ggsave(paste0(figure_filepath, '/os_vs_coverage_metapeak.plot.png'), cov_across_os.mp, height = 16, width = 6)
ggsave(paste0(figure_filepath, '/os_vs_coverage_metapeak.plot.pdf'), cov_across_os.mp, height = 16, width = 6)
For comparison, plot coverage at the Nanog motifs.
isolated_nanog_motifs.gr<-motifs.df %>%
dplyr::left_join(., regions.df %>% dplyr::select(region_id, island_content)) %>%
dplyr::filter(motif=='Nanog') %>%
dplyr::filter(island_content=='Nanog:1') %>%
makeGRangesFromDataFrame(keep.extra.columns = T, seqnames.field = 'seqnames', start.field = 'start',
end.field = 'end', strand.field = 'strand', starts.in.df.are.0based = T) %>%
GenomicRanges::resize(., 1, 'center')
cov_across_n.df<-mclapply(c('oct4','sox2','klf4','nanog','zic3'), function(x){
message(x)
#Filter to not contain any peak of TF that you're plotting across, avoiding direct profiles
filtered.gr<-isolated_nanog_motifs.gr %>%
plyranges::filter(!overlapsAny(., rtracklayer::import(paste0('narrowpeak/mesc_', x, '_nexus_peaks.bed'))))
cov_across_n_high.df<-filtered.gr %>% plyranges::filter(seq_match_quantile>.5) %>%
exo_metapeak(., sample = cov.bw.list[[x]], upstream = 150, downstream = 151) %>%
dplyr::mutate(type = 'cov across N:1 high',
tf = x)
cov_across_n_low.df<-filtered.gr %>% plyranges::filter(seq_match_quantile<=.5) %>%
exo_metapeak(., sample = cov.bw.list[[x]], upstream = 150, downstream = 151) %>%
dplyr::mutate(type = 'cov across N:1 low',
tf = x)
cov_across_nonpeak.df<-nonpeak_of_nanog.gr %>%
exo_metapeak(., sample = cov.bw.list[[x]], upstream = 150, downstream = 151) %>%
dplyr::mutate(type = 'cov across nonpeak',
tf = x)
df<-rbind(cov_across_n_high.df, cov_across_n_low.df, cov_across_nonpeak.df)
return(df)
}, mc.cores = 6) %>% rbindlist() %>%
dplyr::mutate(tf = factor(tf, levels = c('oct4','sox2','nanog','klf4','zic3')))
cov_across_n.mp<-ggplot(cov_across_n.df, aes(x = tss_distance, y = reads, group = interaction(strand, type), color = type))+
geom_line()+
scale_x_continuous(name = 'Distance from motif center(bp)')+
scale_y_continuous(name = 'Nanog ChIP-nexus')+
ggtitle('Nanog coverage across Nanog motifs')+
facet_grid(tf ~ ., scales = 'free')+
theme_classic()
cov_across_n.mp
ggsave(paste0(figure_filepath, '/n_vs_coverage_metapeak.plot.png'), cov_across_n.mp, height = 20, width = 6)
ggsave(paste0(figure_filepath, '/n_vs_coverage_metapeak.plot.pdf'), cov_across_n.mp, height = 20, width = 6)
For the purposes of reproducibility, the R/Bioconductor session information is printed below:
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 8.9 (Green Obsidian)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so; LAPACK version 3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/Chicago
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggseqlogo_0.2
## [2] TxDb.Mmusculus.UCSC.mm10.knownGene_3.10.0
## [3] GenomicFeatures_1.56.0
## [4] AnnotationDbi_1.66.0
## [5] Biobase_2.64.0
## [6] BSgenome.Mmusculus.UCSC.mm10_1.4.3
## [7] BSgenome_1.72.0
## [8] BiocIO_1.14.0
## [9] viridis_0.6.5
## [10] viridisLite_0.4.2
## [11] digest_0.6.37
## [12] pander_0.6.6
## [13] lattice_0.22-6
## [14] testit_0.13
## [15] readr_2.1.5
## [16] patchwork_1.3.0
## [17] data.table_1.17.0
## [18] dplyr_1.1.4
## [19] Rsamtools_2.20.0
## [20] plyranges_1.24.0
## [21] reshape2_1.4.4
## [22] ggplot2_3.5.2
## [23] Biostrings_2.72.1
## [24] XVector_0.44.0
## [25] magrittr_2.0.3
## [26] rtracklayer_1.64.0
## [27] GenomicRanges_1.56.2
## [28] GenomeInfoDb_1.40.1
## [29] IRanges_2.38.1
## [30] S4Vectors_0.42.1
## [31] BiocGenerics_0.50.0
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 bitops_1.0-9
## [3] gridExtra_2.3 rlang_1.1.4
## [5] matrixStats_1.5.0 compiler_4.4.1
## [7] RSQLite_2.3.9 mgcv_1.9-1
## [9] systemfonts_1.2.3 png_0.1-8
## [11] vctrs_0.6.5 stringr_1.5.1
## [13] pkgconfig_2.0.3 crayon_1.5.3
## [15] fastmap_1.2.0 backports_1.5.0
## [17] labeling_0.4.3 utf8_1.2.4
## [19] rmarkdown_2.29 tzdb_0.4.0
## [21] UCSC.utils_1.0.0 ggbeeswarm_0.7.2
## [23] ragg_1.3.3 purrr_1.0.2
## [25] bit_4.6.0 xfun_0.51
## [27] zlibbioc_1.50.0 cachem_1.1.0
## [29] jsonlite_1.8.9 blob_1.2.4
## [31] DelayedArray_0.30.1 BiocParallel_1.38.0
## [33] broom_1.0.7 R6_2.5.1
## [35] bslib_0.8.0 stringi_1.8.4
## [37] car_3.1-3 jquerylib_0.1.4
## [39] Rcpp_1.0.14 SummarizedExperiment_1.34.0
## [41] knitr_1.50 splines_4.4.1
## [43] Matrix_1.7-0 tidyselect_1.2.1
## [45] rstudioapi_0.17.1 abind_1.4-8
## [47] yaml_2.3.10 codetools_0.2-20
## [49] curl_6.2.1 tibble_3.2.1
## [51] plyr_1.8.9 withr_3.0.2
## [53] KEGGREST_1.44.1 evaluate_1.0.3
## [55] ggrastr_1.0.2 ggpubr_0.6.0
## [57] pillar_1.10.2 carData_3.0-5
## [59] MatrixGenerics_1.16.0 generics_0.1.3
## [61] vroom_1.6.5 RCurl_1.98-1.16
## [63] hms_1.1.3 munsell_0.5.1
## [65] scales_1.3.0 glue_1.8.0
## [67] tools_4.4.1 ggsignif_0.6.4
## [69] GenomicAlignments_1.40.0 XML_3.99-0.18
## [71] Cairo_1.6-2 grid_4.4.1
## [73] tidyr_1.3.1 colorspace_2.1-1
## [75] nlme_3.1-164 GenomeInfoDbData_1.2.12
## [77] beeswarm_0.4.0 vipor_0.4.7
## [79] restfulr_0.0.15 Formula_1.2-5
## [81] cli_3.6.3 textshaping_1.0.0
## [83] S4Arrays_1.4.1 gtable_0.3.6
## [85] rstatix_0.7.2 sass_0.4.9
## [87] SparseArray_1.4.8 rjson_0.2.23
## [89] farver_2.1.2 memoise_2.0.1
## [91] htmltools_0.5.8.1 lifecycle_1.0.4
## [93] httr_1.4.7 bit64_4.5.2